library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Five-color palette (hex codes) reused by the plots further down
airbnb_palette <- c("#FF5A5F", "#00A699", "#767676", "#484848", "#FFB400")
# Load the raw listings; read_csv() guesses the column types and reports them
main_uncleaned_data <- read_csv("airbnb1.csv")
Rows: 1010 Columns: 25── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): property_type, room_type, cancellation_policy
dbl (22): accommodates, bathrooms, bedrooms, beds, amenities_count, minimum_nights, maximum_nights, number_of_reviews, review_...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the first rows of the raw data for a quick look at the values
head(main_uncleaned_data)
# Per-column summary of the raw data (quartiles, mean and NA counts for
# numeric columns; length/class/mode for character columns)
summary(main_uncleaned_data)
property_type room_type accommodates bathrooms bedrooms beds amenities_count
Length:1010 Length:1010 Min. :1.000 Min. :1.000 Min. : 1.000 Min. :1.000 Min. : 5.0
Class :character Class :character 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.:16.0
Mode :character Mode :character Median :5.000 Median :3.000 Median : 3.000 Median :3.000 Median :28.0
Mean :4.979 Mean :2.554 Mean : 3.235 Mean :3.045 Mean :27.7
3rd Qu.:7.000 3rd Qu.:4.000 3rd Qu.: 4.000 3rd Qu.:4.000 3rd Qu.:39.0
Max. :9.000 Max. :4.000 Max. :92.000 Max. :6.000 Max. :49.0
NA's :5 NA's :5 NA's :5 NA's :5 NA's :5
minimum_nights maximum_nights number_of_reviews review_scores_rating reviews_per_month availability_30 availability_60
Min. :1.000 Min. : 30.0 Min. : 1.000 Min. :1.000 Min. :0.000 Min. : 0.00 Min. : 0.00
1st Qu.:3.000 1st Qu.: 308.0 1st Qu.: 8.000 1st Qu.:2.000 1st Qu.:1.300 1st Qu.: 7.00 1st Qu.:16.00
Median :5.000 Median : 599.0 Median :10.000 Median :3.000 Median :2.600 Median :16.00 Median :30.00
Mean :5.026 Mean : 589.1 Mean : 9.946 Mean :2.966 Mean :2.534 Mean :15.25 Mean :30.54
3rd Qu.:7.000 3rd Qu.: 870.0 3rd Qu.:12.000 3rd Qu.:4.000 3rd Qu.:3.800 3rd Qu.:24.00 3rd Qu.:46.00
Max. :9.000 Max. :1124.0 Max. :25.000 Max. :5.000 Max. :5.000 Max. :30.00 Max. :60.00
NA's :5 NA's :5 NA's :5 NA's :5 NA's :5 NA's :5 NA's :5
availability_90 availability_365 instant_bookable host_identity_verified host_since cancellation_policy latitude
Min. : 0.00 Min. : 0.0 Min. :0.0000 Min. :0.000 Min. : 31 Length:1010 Min. :40.50
1st Qu.:23.00 1st Qu.: 99.0 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.: 842 Class :character 1st Qu.:40.60
Median :44.00 Median :183.0 Median :0.0000 Median :1.000 Median :1771 Mode :character Median :40.70
Mean :44.38 Mean :184.5 Mean :0.4836 Mean :0.798 Mean :1791 Mean :40.70
3rd Qu.:67.00 3rd Qu.:271.0 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:2773 3rd Qu.:40.81
Max. :90.00 Max. :365.0 Max. :1.0000 Max. :1.000 Max. :3649 Max. :40.90
NA's :5 NA's :5 NA's :5 NA's :5 NA's :5 NA's :5
longitude guests_included extra_people price
Min. :-74.00 Min. :1.00 Min. : 0.00 Min. :-304.0
1st Qu.:-73.94 1st Qu.:1.00 1st Qu.:13.00 1st Qu.: 271.0
Median :-73.86 Median :2.00 Median :25.00 Median : 318.0
Mean :-73.86 Mean :2.48 Mean :25.14 Mean : 316.6
3rd Qu.:-73.78 3rd Qu.:4.00 3rd Qu.:38.00 3rd Qu.: 363.0
Max. :-73.70 Max. :4.00 Max. :49.00 Max. : 566.0
NA's :5 NA's :5 NA's :6 NA's :5
# Compact overview of the raw data: one line per column with its type and
# leading values
glimpse(main_uncleaned_data)
Rows: 1,010
Columns: 25
$ property_type <chr> "Apartment", "Townhouse", "Apartment", "Apartment", "House", "Loft", "House", "House", "Apartment…
$ room_type <chr> "Entire home/apt", "Entire home/apt", "Entire home/apt", "Entire home/apt", "Entire home/apt", "P…
$ accommodates <dbl> 6, 2, 2, 4, 8, 5, 3, 2, 5, 3, 5, 3, 9, 4, 4, 4, 1, 1, 2, 4, 8, 8, 2, 5, 3, 3, 4, 4, 3, 3, 2, 5, 2…
$ bathrooms <dbl> 1, 4, 1, 4, 1, 3, 1, 3, 4, 2, 4, 3, 1, 4, 4, 2, 2, 2, 1, 1, 3, 3, 4, 1, 1, 4, 4, 1, 1, 1, 2, 1, 2…
$ bedrooms <dbl> 3, 1, 4, 2, 2, 2, 1, 5, 2, 4, 3, NA, 3, 1, 5, 4, 4, 5, 5, 4, 5, 3, 3, 5, 4, 1, 2, 2, 2, 2, 2, 3, …
$ beds <dbl> 2, 1, 4, 1, 1, 1, 1, 5, 3, 5, 4, 2, 3, 1, 6, 5, 5, 5, 6, 4, 4, 3, 2, 6, 5, 1, 1, 2, 2, 2, 2, 4, 2…
$ amenities_count <dbl> 41, 31, 8, 49, 23, 40, 20, 14, 25, 49, 48, 9, 39, 37, 35, 19, 27, 11, 16, 38, 28, 48, 8, 30, 22, …
$ minimum_nights <dbl> 2, 2, 1, 9, 5, 3, 8, 1, 7, 8, 2, 5, 3, 6, 6, 4, 5, 8, 8, NA, 3, 6, 7, 3, 2, 8, 6, 6, 9, 3, 6, 8, …
$ maximum_nights <dbl> 503, 83, 256, 714, 55, 869, 486, 817, 697, 1101, 328, 300, 984, 312, 1051, 411, 789, 398, 1111, 5…
$ number_of_reviews <dbl> 10, 7, 9, 8, 16, 9, 10, 8, 8, 7, 8, 7, 7, 10, 11, 12, 8, 3, 12, 9, 9, 10, 15, 16, 14, 10, 11, 15,…
$ review_scores_rating <dbl> 2, 1, 2, 5, 2, 4, 1, 3, 5, 1, 2, 4, 4, 3, 1, 2, 3, 3, 1, 2, 3, 1, 1, 4, 3, 5, 3, NA, 5, 2, 1, 1, …
$ reviews_per_month <dbl> 2.8, 4.9, 4.1, 2.6, 3.8, 1.6, 5.0, 3.7, 4.1, 4.9, 3.0, 1.8, 4.7, 3.4, 1.2, 3.7, 3.6, 2.2, 3.3, 4.…
$ availability_30 <dbl> 23, 19, 2, 8, 5, 0, 24, 9, 28, 2, 28, 15, 5, 26, 28, 26, 29, 2, 23, 0, 11, 3, 28, 16, 26, 22, 16,…
$ availability_60 <dbl> 56, 57, 37, 7, 14, 22, 45, 58, 26, 37, 19, 49, 49, 9, 40, 51, 2, 53, 54, 31, 27, 49, 52, 23, 55, …
$ availability_90 <dbl> 41, 87, 65, 34, 57, 47, 83, 77, 78, 12, 21, 77, 8, 71, 23, 50, 9, 89, 0, 23, 70, 44, 28, 26, 5, 8…
$ availability_365 <dbl> 18, 196, 278, 263, 30, 44, 185, 230, 336, 18, 221, 60, 52, 72, 233, 121, 62, 242, 217, 249, 80, 2…
$ instant_bookable <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0…
$ host_identity_verified <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0…
$ host_since <dbl> 1652, 1126, 1728, 3264, 902, 2773, 771, 424, 1952, 552, 1318, 2494, 1507, 1014, 2518, 3486, 3567,…
$ cancellation_policy <chr> "flexible", "flexible", "strict", "strict_14_with_grace_period", "strict_14_with_grace_period", "…
$ latitude <dbl> 40.72955, 40.62440, 40.59282, 40.71197, 40.85087, 40.85365, 40.88695, 40.62214, 40.87585, 40.5826…
$ longitude <dbl> -73.93356, -73.88089, -73.74172, -73.91624, -73.72905, -73.86373, -73.76245, -73.84639, -73.86990…
$ guests_included <dbl> 4, 4, 4, 1, 4, 1, 2, 4, 4, 2, 1, 4, 2, 2, 4, 1, 4, 4, 1, 1, 1, 4, 3, 4, 3, 3, 1, 3, 2, 2, 3, 1, 1…
$ extra_people <dbl> 3, 41, 5, 6, 6, 31, 19, 34, 29, 9, 47, 11, 0, 21, 7, 29, 42, 17, 24, 14, 10, 3, 0, 4, 3, 49, 6, 3…
$ price <dbl> 327, 335, 213, 399, 301, 319, 272, 344, 307, 341, 397, 264, 321, 408, 504, 285, 346, 293, 322, 28…
# Share of missing values per column, in percent, reshaped to one row per
# column (Variable, Missing_Percentage) so it can be plotted directly
missing_data <- main_uncleaned_data %>%
  summarise(
    across(
      everything(),
      function(column) 100 * mean(is.na(column))
    )
  ) %>%
  pivot_longer(
    everything(),
    values_to = "Missing_Percentage",
    names_to = "Variable"
  )
# Horizontal bar chart of the missing-data percentage, with the columns
# ordered by how much data they are missing
missing_data %>%
  ggplot(
    mapping = aes(
      x = reorder(Variable, -Missing_Percentage),
      y = Missing_Percentage
    )
  ) +
  # geom_col() draws bars whose heights are the y values themselves
  geom_col(fill = "#Ffb400") +
  labs(
    x = "Variable",
    y = "Missing Percentage (%)",
    title = "Percentage of Missing Data by Column"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45)) +
  coord_flip()
NA
# Positions of the missing values. is.na() on a data frame yields a logical
# matrix, so these are linear (column-major) cell indices, not row numbers.
which(is.na(main_uncleaned_data))
[1] 388 420 540 844 967 1123 1399 1874 1894 2006 2132 2221 2356 2657 2990 3121 3247 3746 3920 3945
[21] 4052 4121 4598 4669 4956 5227 5268 5587 5935 5950 6122 6232 6483 6546 6617 7090 7245 7375 7416 7824
[41] 8387 8642 8662 9032 9063 9157 9158 9201 9471 9562 10128 10181 10347 10961 11001 11193 11278 11331 11575 12046
[61] 12321 12410 12567 12727 12839 13294 13502 13607 13921 14007 14331 14338 14441 14609 15039 15219 15595 15977 16001 16095
[81] 16291 16495 16640 17123 17126 17297 17333 17455 18134 18142 18242 18473 19057 19060 19129 19456 19600 19630 19840 20155
[101] 20564 20658 20785 21059 21170 21478 21515 21551 21746 21862 22474 22599 22819 23092 23109 23548 23934 24002 24087 24147
[121] 24231 24375 24628 24656 24692 25018
# Number of missing values in each column
col_missing <- colSums(is.na(main_uncleaned_data))
# Print the per-column counts
col_missing
property_type room_type accommodates bathrooms bedrooms
5 5 5 5 5
beds amenities_count minimum_nights maximum_nights number_of_reviews
5 5 5 5 5
review_scores_rating reviews_per_month availability_30 availability_60 availability_90
5 5 5 5 5
availability_365 instant_bookable host_identity_verified host_since cancellation_policy
5 5 5 5 5
latitude longitude guests_included extra_people price
5 5 5 6 5
# Keep only the rows that have no missing value in any column
dropped_na <- main_uncleaned_data %>% drop_na()
which(is.na(dropped_na)) # sanity check: integer(0) means no missing values remain
integer(0)
# Remove bedroom outliers with the 1.5 * IQR rule: keep rows whose bedrooms
# value lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both ends inclusive).
# NOTE: the variable `IQR` shares its name with stats::IQR(); calls such as
# IQR(x) still resolve to the function.
Q1 <- quantile(dropped_na[["bedrooms"]], probs = 0.25, na.rm = TRUE)
Q3 <- quantile(dropped_na[["bedrooms"]], probs = 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
cleaned_airbnb_data <- filter(
  dropped_na,
  bedrooms >= Q1 - 1.5 * IQR,
  bedrooms <= Q3 + 1.5 * IQR
)
# Inspect the data after the bedroom outliers have been removed
glimpse(cleaned_airbnb_data)
Rows: 886
Columns: 25
$ property_type <chr> "Apartment", "Townhouse", "Apartment", "Apartment", "House", "Loft", "House", "House", "Apartment…
$ room_type <chr> "Entire home/apt", "Entire home/apt", "Entire home/apt", "Entire home/apt", "Entire home/apt", "P…
$ accommodates <dbl> 6, 2, 2, 4, 8, 5, 3, 2, 5, 3, 5, 9, 4, 4, 4, 1, 1, 2, 8, 8, 2, 5, 3, 3, 4, 3, 3, 2, 5, 2, 7, 4, 7…
$ bathrooms <dbl> 1, 4, 1, 4, 1, 3, 1, 3, 4, 2, 4, 1, 4, 4, 2, 2, 2, 1, 3, 3, 4, 1, 1, 4, 4, 1, 1, 2, 1, 2, 4, 4, 2…
$ bedrooms <dbl> 3, 1, 4, 2, 2, 2, 1, 5, 2, 4, 3, 3, 1, 5, 4, 4, 5, 5, 5, 3, 3, 5, 4, 1, 2, 2, 2, 2, 3, 3, 1, 5, 2…
$ beds <dbl> 2, 1, 4, 1, 1, 1, 1, 5, 3, 5, 4, 3, 1, 6, 5, 5, 5, 6, 4, 3, 2, 6, 5, 1, 1, 2, 2, 2, 4, 2, 1, 5, 2…
$ amenities_count <dbl> 41, 31, 8, 49, 23, 40, 20, 14, 25, 49, 48, 39, 37, 35, 19, 27, 11, 16, 28, 48, 8, 30, 22, 24, 39,…
$ minimum_nights <dbl> 2, 2, 1, 9, 5, 3, 8, 1, 7, 8, 2, 3, 6, 6, 4, 5, 8, 8, 3, 6, 7, 3, 2, 8, 6, 9, 3, 6, 8, 8, 7, 8, 8…
$ maximum_nights <dbl> 503, 83, 256, 714, 55, 869, 486, 817, 697, 1101, 328, 984, 312, 1051, 411, 789, 398, 1111, 184, 3…
$ number_of_reviews <dbl> 10, 7, 9, 8, 16, 9, 10, 8, 8, 7, 8, 7, 10, 11, 12, 8, 3, 12, 9, 10, 15, 16, 14, 10, 11, 8, 9, 13,…
$ review_scores_rating <dbl> 2, 1, 2, 5, 2, 4, 1, 3, 5, 1, 2, 4, 3, 1, 2, 3, 3, 1, 3, 1, 1, 4, 3, 5, 3, 5, 2, 1, 1, 2, 4, 4, 1…
$ reviews_per_month <dbl> 2.8, 4.9, 4.1, 2.6, 3.8, 1.6, 5.0, 3.7, 4.1, 4.9, 3.0, 4.7, 3.4, 1.2, 3.7, 3.6, 2.2, 3.3, 0.6, 2.…
$ availability_30 <dbl> 23, 19, 2, 8, 5, 0, 24, 9, 28, 2, 28, 5, 26, 28, 26, 29, 2, 23, 11, 3, 28, 16, 26, 22, 16, 18, 25…
$ availability_60 <dbl> 56, 57, 37, 7, 14, 22, 45, 58, 26, 37, 19, 49, 9, 40, 51, 2, 53, 54, 27, 49, 52, 23, 55, 4, 9, 47…
$ availability_90 <dbl> 41, 87, 65, 34, 57, 47, 83, 77, 78, 12, 21, 8, 71, 23, 50, 9, 89, 0, 70, 44, 28, 26, 5, 86, 82, 2…
$ availability_365 <dbl> 18, 196, 278, 263, 30, 44, 185, 230, 336, 18, 221, 52, 72, 233, 121, 62, 242, 217, 80, 201, 352, …
$ instant_bookable <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0…
$ host_identity_verified <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1…
$ host_since <dbl> 1652, 1126, 1728, 3264, 902, 2773, 771, 424, 1952, 552, 1318, 1507, 1014, 2518, 3486, 3567, 1618,…
$ cancellation_policy <chr> "flexible", "flexible", "strict", "strict_14_with_grace_period", "strict_14_with_grace_period", "…
$ latitude <dbl> 40.72955, 40.62440, 40.59282, 40.71197, 40.85087, 40.85365, 40.88695, 40.62214, 40.87585, 40.5826…
$ longitude <dbl> -73.93356, -73.88089, -73.74172, -73.91624, -73.72905, -73.86373, -73.76245, -73.84639, -73.86990…
$ guests_included <dbl> 4, 4, 4, 1, 4, 1, 2, 4, 4, 2, 1, 2, 2, 4, 1, 4, 4, 1, 1, 4, 3, 4, 3, 3, 1, 2, 2, 3, 1, 1, 1, 4, 3…
$ extra_people <dbl> 3, 41, 5, 6, 6, 31, 19, 34, 29, 9, 47, 0, 21, 7, 29, 42, 17, 24, 10, 3, 0, 4, 3, 49, 6, 21, 23, 4…
$ price <dbl> 327, 335, 213, 399, 301, 319, 272, 344, 307, 341, 397, 321, 408, 504, 285, 346, 293, 322, 342, 46…
# Numeric columns whose distributions are inspected with histograms and
# boxplots
numeric_vars <- c(
  "bathrooms", "bedrooms", "beds", "amenities_count", "minimum_nights",
  "maximum_nights", "number_of_reviews", "review_scores_rating", "price"
)
# Columns summarised with bar plots of category counts. The last entry must
# match the data's column name exactly: a misspelt name makes `[[` return NULL
# on a tibble, and the bar plot for that variable is then skipped.
categorical_vars <- c(
  "property_type", "room_type", "cancellation_policy",
  "host_identity_verified"
)
# Size the panel grid from the number of variables so that every histogram
# lands on the same page; a fixed 2 x 4 grid pushes a ninth variable onto a
# page of its own
par(mfrow = n2mfrow(length(numeric_vars)))
# Create a histogram for each variable (data before outlier removal).
# lapply() also returns the list of "histogram" objects, which is printed
# when this is run at top level.
lapply(numeric_vars, function(var) {
  hist(dropped_na[[var]],
    main = paste("Histogram of", var),
    xlab = var,
    col = "#FFB400",
    border = "white"
  )
})
[[1]]
$breaks
[1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0
$counts
[1] 211 0 0 0 233 0 0 0 0 202 0 0 0 0 245
$density
[1] 1.184063 0.000000 0.000000 0.000000 1.307520 0.000000 0.000000 0.000000 0.000000 1.133558 0.000000 0.000000 0.000000 0.000000
[15] 1.374860
$mids
[1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7 3.9
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[2]]
$breaks
[1] 0 10 20 30 40 50 60 70 80 90 100
$counts
[1] 886 0 1 0 2 0 1 0 0 1
$density
[1] 0.0994388328 0.0000000000 0.0001122334 0.0000000000 0.0002244669 0.0000000000 0.0001122334 0.0000000000 0.0000000000
[10] 0.0001122334
$mids
[1] 5 15 25 35 45 55 65 75 85 95
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[3]]
$breaks
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0
$counts
[1] 183 179 0 163 0 180 0 129 0 57
$density
[1] 0.4107744 0.4017957 0.0000000 0.3658810 0.0000000 0.4040404 0.0000000 0.2895623 0.0000000 0.1279461
$mids
[1] 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[4]]
$breaks
[1] 5 10 15 20 25 30 35 40 45 50
$counts
[1] 110 98 78 117 85 102 110 97 94
$density
[1] 0.02469136 0.02199776 0.01750842 0.02626263 0.01907969 0.02289562 0.02469136 0.02177329 0.02109989
$mids
[1] 7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[5]]
$breaks
[1] 1 2 3 4 5 6 7 8 9
$counts
[1] 193 122 86 95 83 109 107 96
$density
[1] 0.21661055 0.13692480 0.09652076 0.10662177 0.09315376 0.12233446 0.12008979 0.10774411
$mids
[1] 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[6]]
$breaks
[1] 0 100 200 300 400 500 600 700 800 900 1000 1100 1200
$counts
[1] 56 75 85 92 71 68 73 84 93 75 91 28
$density
[1] 0.0006285073 0.0008417508 0.0009539843 0.0010325477 0.0007968575 0.0007631874 0.0008193042 0.0009427609 0.0010437710
[10] 0.0008417508 0.0010213244 0.0003142536
$mids
[1] 50 150 250 350 450 550 650 750 850 950 1050 1150
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[7]]
$breaks
[1] 0 2 4 6 8 10 12 14 16 18 20 22 24 26
$counts
[1] 5 21 90 186 239 180 97 51 13 6 1 1 1
$density
[1] 0.0028058361 0.0117845118 0.0505050505 0.1043771044 0.1341189675 0.1010101010 0.0544332211 0.0286195286 0.0072951740
[10] 0.0033670034 0.0005611672 0.0005611672 0.0005611672
$mids
[1] 1 3 5 7 9 11 13 15 17 19 21 23 25
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[8]]
$breaks
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
$counts
[1] 124 236 0 202 0 216 0 113
$density
[1] 0.2783389 0.5297419 0.0000000 0.4534231 0.0000000 0.4848485 0.0000000 0.2536476
$mids
[1] 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[9]]
$breaks
[1] -400 -300 -200 -100 0 100 200 300 400 500 600
$counts
[1] 1 3 1 0 3 39 311 423 104 6
$density
[1] 1.122334e-05 3.367003e-05 1.122334e-05 0.000000e+00 3.367003e-05 4.377104e-04 3.490460e-03 4.747475e-03 1.167228e-03
[10] 6.734007e-05
$mids
[1] -350 -250 -150 -50 50 150 250 350 450 550
$xname
[1] "dropped_na[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
# Size the panel grid from the number of variables so that every boxplot
# lands on the same page; a fixed 2 x 4 grid pushes a ninth variable onto a
# page of its own
par(mfrow = n2mfrow(length(numeric_vars)))
# Create a boxplot for each variable (data before outlier removal).
# lapply() also returns the list of boxplot statistics, which is printed when
# this is run at top level.
lapply(numeric_vars, function(var) {
  boxplot(dropped_na[[var]],
    main = paste("Boxplot of", var),
    ylab = var,
    col = "#00A699"
  )
})
[[1]]
[[1]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 4
[[1]]$n
[1] 891
[[1]]$conf
[,1]
[1,] 2.894136
[2,] 3.105864
[[1]]$out
numeric(0)
[[1]]$group
numeric(0)
[[1]]$names
[1] ""
[[2]]
[[2]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 5
[[2]]$n
[1] 891
[[2]]$conf
[,1]
[1,] 2.894136
[2,] 3.105864
[[2]]$out
[1] 44 92 70 28 45
[[2]]$group
[1] 1 1 1 1 1
[[2]]$names
[1] ""
[[3]]
[[3]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 6
[[3]]$n
[1] 891
[[3]]$conf
[,1]
[1,] 2.894136
[2,] 3.105864
[[3]]$out
numeric(0)
[[3]]$group
numeric(0)
[[3]]$names
[1] ""
[[4]]
[[4]]$stats
[,1]
[1,] 5
[2,] 16
[3,] 28
[4,] 39
[5,] 49
[[4]]$n
[1] 891
[[4]]$conf
[,1]
[1,] 26.78256
[2,] 29.21744
[[4]]$out
numeric(0)
[[4]]$group
numeric(0)
[[4]]$names
[1] ""
[[5]]
[[5]]$stats
[,1]
[1,] 1
[2,] 3
[3,] 5
[4,] 7
[5,] 9
[[5]]$n
[1] 891
[[5]]$conf
[,1]
[1,] 4.788272
[2,] 5.211728
[[5]]$out
numeric(0)
[[5]]$group
numeric(0)
[[5]]$names
[1] ""
[[6]]
[[6]]$stats
[,1]
[1,] 30.0
[2,] 307.0
[3,] 598.0
[4,] 868.5
[5,] 1124.0
[[6]]$n
[1] 891
[[6]]$conf
[,1]
[1,] 568.2787
[2,] 627.7213
[[6]]$out
numeric(0)
[[6]]$group
numeric(0)
[[6]]$names
[1] ""
[[7]]
[[7]]$stats
[,1]
[1,] 2
[2,] 8
[3,] 10
[4,] 12
[5,] 18
[[7]]$n
[1] 891
[[7]]$conf
[,1]
[1,] 9.788272
[2,] 10.211728
[[7]]$out
[1] 19 25 19 1 24 1 19 19 21 20 20
[[7]]$group
[1] 1 1 1 1 1 1 1 1 1 1 1
[[7]]$names
[1] ""
[[8]]
[[8]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 5
[[8]]$n
[1] 891
[[8]]$conf
[,1]
[1,] 2.894136
[2,] 3.105864
[[8]]$out
numeric(0)
[[8]]$group
numeric(0)
[[8]]$names
[1] ""
[[9]]
[[9]]$stats
[,1]
[1,] 133.0
[2,] 270.5
[3,] 317.0
[4,] 363.0
[5,] 493.0
[[9]]$n
[1] 891
[[9]]$conf
[,1]
[1,] 312.1038
[2,] 321.8962
[[9]]$out
[1] 504 -304 99 515 97 -286 544 -156 566 -265 99 503 544 -258
[[9]]$group
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[[9]]$names
[1] ""
There are negative price outliers, so we’ll filter those out.
# Keep only listings with a strictly positive price (this drops the negative
# prices seen in the boxplot above)
cleaned_airbnb_data <- filter(cleaned_airbnb_data, price > 0)
# Size the panel grid from the number of variables so that every histogram
# lands on the same page; a fixed 2 x 4 grid pushes a ninth variable onto a
# page of its own
par(mfrow = n2mfrow(length(numeric_vars)))
# Create a histogram for each variable (cleaned data).
# lapply() also returns the list of "histogram" objects, which is printed
# when this is run at top level.
lapply(numeric_vars, function(var) {
  hist(cleaned_airbnb_data[[var]],
    main = paste("Histogram of", var),
    xlab = var,
    col = "#FFB400",
    border = "white"
  )
})
[[1]]
$breaks
[1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0
$counts
[1] 207 0 0 0 229 0 0 0 0 202 0 0 0 0 243
$density
[1] 1.174801 0.000000 0.000000 0.000000 1.299659 0.000000 0.000000 0.000000 0.000000 1.146425 0.000000 0.000000 0.000000 0.000000
[15] 1.379115
$mids
[1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7 3.9
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[2]]
$breaks
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
$counts
[1] 182 176 0 173 0 166 0 184
$density
[1] 0.4131669 0.3995460 0.0000000 0.3927355 0.0000000 0.3768445 0.0000000 0.4177072
$mids
[1] 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[3]]
$breaks
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0
$counts
[1] 180 178 0 160 0 179 0 127 0 57
$density
[1] 0.4086266 0.4040863 0.0000000 0.3632236 0.0000000 0.4063564 0.0000000 0.2883087 0.0000000 0.1293984
$mids
[1] 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[4]]
$breaks
[1] 5 10 15 20 25 30 35 40 45 50
$counts
[1] 107 98 78 115 84 102 110 96 91
$density
[1] 0.02429058 0.02224745 0.01770715 0.02610670 0.01906924 0.02315551 0.02497162 0.02179342 0.02065834
$mids
[1] 7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[5]]
$breaks
[1] 1 2 3 4 5 6 7 8 9
$counts
[1] 191 120 86 94 81 107 107 95
$density
[1] 0.21679909 0.13620885 0.09761635 0.10669694 0.09194098 0.12145289 0.12145289 0.10783201
$mids
[1] 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[6]]
$breaks
[1] 0 100 200 300 400 500 600 700 800 900 1000 1100 1200
$counts
[1] 54 74 84 92 71 66 73 84 92 73 91 27
$density
[1] 0.0006129398 0.0008399546 0.0009534620 0.0010442679 0.0008059024 0.0007491487 0.0008286039 0.0009534620 0.0010442679
[10] 0.0008286039 0.0010329171 0.0003064699
$mids
[1] 50 150 250 350 450 550 650 750 850 950 1050 1150
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[7]]
$breaks
[1] 0 2 4 6 8 10 12 14 16 18 20 22 24 26
$counts
[1] 5 21 89 183 236 178 96 51 13 6 1 1 1
$density
[1] 0.0028376844 0.0119182747 0.0505107832 0.1038592509 0.1339387060 0.1010215664 0.0544835414 0.0289443814 0.0073779796
[10] 0.0034052213 0.0005675369 0.0005675369 0.0005675369
$mids
[1] 1 3 5 7 9 11 13 15 17 19 21 23 25
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[8]]
$breaks
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
$counts
[1] 123 232 0 199 0 214 0 113
$density
[1] 0.2792281 0.5266742 0.0000000 0.4517594 0.0000000 0.4858116 0.0000000 0.2565267
$mids
[1] 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[9]]
$breaks
[1] 50 100 150 200 250 300 350 400 450 500 550 600
$counts
[1] 3 3 36 102 206 258 163 76 28 5 1
$density
[1] 6.810443e-05 6.810443e-05 8.172531e-04 2.315551e-03 4.676504e-03 5.856981e-03 3.700341e-03 1.725312e-03 6.356413e-04
[10] 1.135074e-04 2.270148e-05
$mids
[1] 75 125 175 225 275 325 375 425 475 525 575
$xname
[1] "cleaned_airbnb_data[[var]]"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
# Size the panel grid from the number of variables so that every boxplot
# lands on the same page; a fixed 2 x 4 grid pushes a ninth variable onto a
# page of its own
par(mfrow = n2mfrow(length(numeric_vars)))
# Create a boxplot for each variable (cleaned data).
# lapply() also returns the list of boxplot statistics, which is printed when
# this is run at top level.
lapply(numeric_vars, function(var) {
  boxplot(cleaned_airbnb_data[[var]],
    main = paste("Boxplot of", var),
    ylab = var,
    col = "#00A699"
  )
})
[[1]]
[[1]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 4
[[1]]$n
[1] 881
[[1]]$conf
[,1]
[1,] 2.893537
[2,] 3.106463
[[1]]$out
numeric(0)
[[1]]$group
numeric(0)
[[1]]$names
[1] ""
[[2]]
[[2]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 5
[[2]]$n
[1] 881
[[2]]$conf
[,1]
[1,] 2.893537
[2,] 3.106463
[[2]]$out
numeric(0)
[[2]]$group
numeric(0)
[[2]]$names
[1] ""
[[3]]
[[3]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 6
[[3]]$n
[1] 881
[[3]]$conf
[,1]
[1,] 2.893537
[2,] 3.106463
[[3]]$out
numeric(0)
[[3]]$group
numeric(0)
[[3]]$names
[1] ""
[[4]]
[[4]]$stats
[,1]
[1,] 5
[2,] 16
[3,] 28
[4,] 39
[5,] 49
[[4]]$n
[1] 881
[[4]]$conf
[,1]
[1,] 26.77567
[2,] 29.22433
[[4]]$out
numeric(0)
[[4]]$group
numeric(0)
[[4]]$names
[1] ""
[[5]]
[[5]]$stats
[,1]
[1,] 1
[2,] 3
[3,] 5
[4,] 7
[5,] 9
[[5]]$n
[1] 881
[[5]]$conf
[,1]
[1,] 4.787074
[2,] 5.212926
[[5]]$out
numeric(0)
[[5]]$group
numeric(0)
[[5]]$names
[1] ""
[[6]]
[[6]]$stats
[,1]
[1,] 30
[2,] 308
[3,] 599
[4,] 868
[5,] 1124
[[6]]$n
[1] 881
[[6]]$conf
[,1]
[1,] 569.1903
[2,] 628.8097
[[6]]$out
numeric(0)
[[6]]$group
numeric(0)
[[6]]$names
[1] ""
[[7]]
[[7]]$stats
[,1]
[1,] 2
[2,] 8
[3,] 10
[4,] 12
[5,] 18
[[7]]$n
[1] 881
[[7]]$conf
[,1]
[1,] 9.787074
[2,] 10.212926
[[7]]$out
[1] 19 25 19 1 24 1 19 19 21 20 20
[[7]]$group
[1] 1 1 1 1 1 1 1 1 1 1 1
[[7]]$names
[1] ""
[[8]]
[[8]]$stats
[,1]
[1,] 1
[2,] 2
[3,] 3
[4,] 4
[5,] 5
[[8]]$n
[1] 881
[[8]]$conf
[,1]
[1,] 2.893537
[2,] 3.106463
[[8]]$out
numeric(0)
[[8]]$group
numeric(0)
[[8]]$names
[1] ""
[[9]]
[[9]]$stats
[,1]
[1,] 136
[2,] 272
[3,] 317
[4,] 364
[5,] 493
[[9]]$n
[1] 881
[[9]]$conf
[,1]
[1,] 312.1027
[2,] 321.8973
[[9]]$out
[1] 504 99 515 97 544 566 99 503 544 133
[[9]]$group
[1] 1 1 1 1 1 1 1 1 1 1
[[9]]$names
[1] ""
# One panel per categorical variable, with a larger bottom margin to
# accommodate the rotated labels
par(mfrow = c(1, length(categorical_vars)), mar = c(6, 5, 3, 2))
# Create the bar plots with rotated labels (las = 2).
# seq_along() is used instead of 1:length() so an empty vector yields no
# iterations.
for (i in seq_along(categorical_vars)) {
  counts <- table(cleaned_airbnb_data[[categorical_vars[i]]])
  if (length(counts) == 0 || all(counts == 0)) {
    # Nothing to draw. This also happens when the name does not match any
    # column, so report it instead of skipping silently.
    warning(
      "Skipping bar plot for '", categorical_vars[i],
      "': column is missing or has no values",
      call. = FALSE
    )
    next
  }
  barplot(
    counts,
    main = categorical_vars[i],
    col = "#ff5a5f",
    border = "black",
    ylim = c(0, max(counts) + 10), # small headroom above the tallest bar
    las = 2
  )
}
library(corrplot)
corrplot 0.95 loaded
# Drop the coordinates and the three character columns so that only numeric
# columns remain, then compute their pairwise correlations
selected_df_for_modelling <- cleaned_airbnb_data %>%
  select(
    -c(latitude, longitude, cancellation_policy, property_type, room_type)
  )
correlation_matrix <- selected_df_for_modelling %>% cor()
# Draw correlation heatmap of the numeric modelling columns
corrplot(correlation_matrix,
method = "circle", # Use circles to represent correlations
type = "full", # Display the full matrix (both triangles and the diagonal)
tl.col = "darkblue", # Set text-label color to dark blue
tl.srt = 45, # Rotate labels for better readability
tl.cex = 0.8, # Adjust font size of labels
cl.cex = 0.8, # Adjust font size of color legend
pch.cex = 1.2) # Size of the pch marks; NOTE(review): no p.mat/insig is supplied, so this likely has no visible effect — confirm
# Candidate predictors that are plotted against price below
price_col_vars <- c(
  "accommodates", "bathrooms", "bedrooms", "beds", "amenities_count"
)
# Check the class of each variable in price_col_vars. vapply() guarantees a
# character vector with exactly one class per column and fails loudly
# otherwise, whereas sapply() would silently change its return type.
vapply(cleaned_airbnb_data[, price_col_vars], class, character(1))
accommodates bathrooms bedrooms beds amenities_count
"numeric" "numeric" "numeric" "numeric" "numeric"
# Make sure every candidate predictor is stored as a numeric column
cleaned_airbnb_data <- cleaned_airbnb_data %>%
  mutate(across(all_of(price_col_vars), as.numeric))
# One base-graphics scatter plot of price against each predictor
for (var in price_col_vars) {
  plot(
    x = as.numeric(cleaned_airbnb_data[[var]]),
    y = as.numeric(cleaned_airbnb_data$price),
    pch = 19,
    col = "#00A699",
    xlab = var,
    ylab = "Price",
    main = paste("Relationship between", var, "and Price")
  )
}
library(ggplot2)
# Price against amenities count, with a fitted linear trend line on top of
# the semi-transparent points
cleaned_airbnb_data %>%
  ggplot(mapping = aes(x = amenities_count, y = price)) +
  geom_point(alpha = 0.6, color = "#00A699") +
  geom_smooth(color = "#FF5A5F", method = "lm") +
  theme_minimal() +
  labs(
    x = "Number of Amenities",
    y = "Price",
    title = "Price vs Number of Amenities"
  )
# Price against number of reviews, with a fitted linear trend line on top of
# the semi-transparent points
cleaned_airbnb_data %>%
  ggplot(mapping = aes(x = number_of_reviews, y = price)) +
  geom_point(alpha = 0.6, color = "#00A699") +
  geom_smooth(color = "#FF5A5F", method = "lm") +
  theme_minimal() +
  labs(
    x = "Number of Reviews",
    y = "Price",
    title = "Price vs Number of Reviews"
  )
# Area chart of beds over bedrooms.
# NOTE(review): each bedrooms value occurs on many rows, so the area is drawn
# through repeated x values — confirm an area chart is the intended geometry.
cleaned_airbnb_data %>%
  ggplot(mapping = aes(x = bedrooms, y = beds)) +
  geom_area(fill = "#ffb400", color = "#ffb400") +
  theme_minimal() +
  labs(
    x = "Bedrooms",
    y = "Beds",
    title = "Area Chart for Beds and Bedrooms"
  )
# Density plot of price by room type; the semi-transparent fills keep the
# overlapping densities visible
ggplot(cleaned_airbnb_data, aes(x = price, fill = room_type)) +
  geom_density(alpha = 0.5) +
  labs(title = "Price Density by Room Type", x = "Price", y = "Density") +
  theme_minimal()
# State outlines used as the basemap. NOTE: despite its name, `world_map`
# holds the "state" map rather than a world map; the name is kept so any
# later use of the variable keeps working.
world_map <- map_data("state")
# Listings drawn over the basemap: point size encodes the review score and
# point color encodes the price
ggplot() +
  geom_polygon(
    data = world_map,
    aes(x = long, y = lat, group = group),
    fill = "white"
  ) +
  geom_point(
    data = cleaned_airbnb_data,
    aes(
      x = longitude, y = latitude,
      size = review_scores_rating, color = price
    )
  ) +
  # Zoom to the extent of the listings. They span well under one degree in
  # each direction, so on the full basemap they collapse into a single blob.
  # coord_quickmap() also keeps an approximately correct aspect ratio.
  coord_quickmap(
    xlim = range(cleaned_airbnb_data$longitude, na.rm = TRUE),
    ylim = range(cleaned_airbnb_data$latitude, na.rm = TRUE)
  )
# Share of each review score within every room type. position = "fill"
# rescales each bar to 1, so the y axis shows proportions rather than counts.
ggplot(
  cleaned_airbnb_data,
  aes(x = room_type, fill = factor(review_scores_rating))
) +
  geom_bar(position = "fill") +
  labs(
    title = "Distribution of Review Score Rating by Room Type",
    x = "Room Type",
    y = "Proportion",
    fill = "Review Score Rating"
  ) +
  # One palette color per review score level
  scale_fill_manual(values = airbnb_palette) +
  theme_minimal()
# Share of each review score within every cancellation policy.
# position = "fill" rescales each bar to 1, so the y axis shows proportions
# rather than counts.
ggplot(
  cleaned_airbnb_data,
  aes(x = cancellation_policy, fill = factor(review_scores_rating))
) +
  geom_bar(position = "fill") +
  labs(
    title = "Distribution of Review Score Rating by Cancellation Policy",
    x = "Cancellation Policy",
    y = "Proportion",
    fill = "Review Score Rating"
  ) +
  # One palette color per review score level
  scale_fill_manual(values = airbnb_palette) +
  theme_minimal()
# Price against amenities count, colored by room type, with one smoothed
# trend line per room type (no confidence band)
cleaned_airbnb_data %>%
  ggplot(aes(x = amenities_count, y = price, color = room_type)) +
  geom_point(size = 2) +
  # The smoother inherits color = room_type from the plot-level mapping
  geom_smooth(se = FALSE) +
  scale_color_manual(
    values = c(
      "Entire home/apt" = "#FF5A5F",
      "Private room" = "#00A699",
      "Shared room" = "#ffb400"
    )
  ) +
  labs(
    x = "Amenities Count",
    y = "Price",
    color = "Room Type",
    title = "Price vs. Amenities Count by Room Type"
  ) +
  theme_minimal()
# Price distribution for each property type, with the category labels
# slanted so that they do not overlap
cleaned_airbnb_data %>%
  ggplot(mapping = aes(x = property_type, y = price)) +
  geom_boxplot(color = "black", fill = "#ffb400") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  labs(
    x = "Property Type",
    y = "Price",
    title = "Price by Property Type"
  )
# Boxplot for Price by Room Type; the title and x label name the variable
# that is actually on the x axis (room_type)
ggplot(cleaned_airbnb_data, aes(x = room_type, y = price)) +
  geom_boxplot(fill = "#ffb400", color = "black") +
  labs(title = "Price by Room Type", x = "Room Type", y = "Price") +
  theme_minimal() +
  # Slant the category labels so that they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Boxplot of review rating by host identity verification status;
# host_identity_verified is converted to a factor so each status gets its own
# box
ggplot(
  cleaned_airbnb_data,
  aes(x = factor(host_identity_verified), y = review_scores_rating)
) +
  geom_boxplot(fill = "#ff5a5f", color = "black") +
  labs(
    title = "Rating by Host Verification",
    x = "Host Verification Status",
    y = "Rating"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))